In [1]:
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [2]:
import tabulate

Load and combine the data


In [3]:
SPLITS = ['train', 'test', 'dev']

In [4]:
df = {}
for split in SPLITS:
    df[split] = pd.read_csv('../../../../data/annotations/split/' + split + '/annotations.tsv',
                            sep='\t', encoding='utf-8')

In [5]:
combined_df = pd.concat(df.values())
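
As a quick sanity check (a sketch added here, not an executed cell of the original notebook), the per-split row counts should add up to the size of the combined frame:

In [ ]:
for split, frame in df.items():
    print(split, len(frame))
print('combined', len(combined_df))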

Aggregate the data


In [6]:
SOURCES = ['article_blocked', 'article_random', 'user_blocked', 'user_random']

In [7]:
agg_dict = {'ns': 'first', 'sample': 'first', 'src': 'first', 'recipient': 'mean', 
            'attack': 'mean', 'aggression': 'mean'}
grouped_df = combined_df.groupby('rev_id').agg(agg_dict)
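
The 'mean' aggregations above average over however many workers annotated each revision; the per-revision annotation counts can be inspected directly (a sketch, not an executed cell):

In [ ]:
annotations_per_rev = combined_df.groupby('rev_id').size()
print(annotations_per_rev.describe())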

In [8]:
grouped_source_df = {}
for source in SOURCES:
    grouped_source_df[source] = grouped_df[grouped_df['src'].str.contains(source)]
grouped_source_df['total'] = grouped_df

Compute the values of interest


In [9]:
number_of_revisions = {k: len(v) for k, v in grouped_source_df.items()}

In [10]:
number_of_revisions


Out[10]:
{'article_blocked': 31421,
 'article_random': 19571,
 'total': 115737,
 'user_blocked': 46705,
 'user_random': 18040}

In [11]:
# Next compute proportion of aggressive and attacking revisions for each source
num = {}
perc = {}
for term in ['attack', 'aggression']:
    num[term] = {}
    perc[term] = {}
    for source in SOURCES + ['total']:
        num[term][source] = len(grouped_source_df[source].query('%s > 0.5' % term))
        perc[term][source] = num[term][source]/number_of_revisions[source]

In [12]:
for term in ['attack', 'aggression']:
    print(term)
    print(num[term])
    print(perc[term])


attack
{'user_random': 190, 'article_random': 150, 'user_blocked': 11147, 'total': 13575, 'article_blocked': 2088}
{'user_random': 0.010532150776053215, 'article_random': 0.00766440141024986, 'user_blocked': 0.23866823680548122, 'total': 0.11729179087068094, 'article_blocked': 0.06645237261703955}
aggression
{'user_random': 247, 'article_random': 207, 'user_blocked': 11849, 'total': 14760, 'article_blocked': 2457}
{'user_random': 0.013691796008869179, 'article_random': 0.010576873946144805, 'user_blocked': 0.2536987474574457, 'total': 0.12753052178646415, 'article_blocked': 0.07819611088125776}
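
The same counts and proportions read more easily as a table; tabulate (imported above but otherwise unused here) can render one (a sketch, not an executed cell):

In [ ]:
rows = [[source,
         number_of_revisions[source],
         num['attack'][source], '%.1f%%' % (100 * perc['attack'][source]),
         num['aggression'][source], '%.1f%%' % (100 * perc['aggression'][source])]
        for source in SOURCES + ['total']]
print(tabulate.tabulate(rows, headers=['source', 'revisions', 'attacks', '% attack',
                                       'aggressive', '% aggressive']))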

Inter-annotator agreement


In [13]:
dat = combined_df  # alias, not a copy: the label columns added below also land on combined_df

In [14]:
dat.columns


Out[14]:
Index(['rev_id', '_worker_id', 'ns', 'sample', 'src', 'clean_diff', 'diff',
       'insert_only', 'page_id', 'page_title', 'rev_comment', 'rev_timestamp',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression'],
      dtype='object')

In [15]:
ATTACK_COLUMNS = ['attack_bool', 'not_attack']
# Binarize each individual annotation at 0.5; the complement columns let the
# per-revision sums below count how many annotators chose each label.
# (Note: 'not_attack' overwrites the existing column of that name.)
dat['attack_bool'] = (dat['attack'] > 0.5).astype(int)
dat['not_attack'] = 1 - dat['attack_bool']
AGGRESSIVE_COLUMNS = ['aggressive_bool', 'not_aggressive']
dat['aggressive_bool'] = (dat['aggression'] > 0.5).astype(int)
dat['not_aggressive'] = 1 - dat['aggressive_bool']

In [16]:
agg_dict = {'ns': 'first', 'sample': 'first', 'src': 'first', 'recipient': 'mean', 
            'attack': 'mean', 'aggression': 'mean'}
agg_dict.update(dict.fromkeys(ATTACK_COLUMNS, 'sum'))
agg_dict.update(dict.fromkeys(AGGRESSIVE_COLUMNS, 'sum'))
ia_df = dat.groupby('rev_id').agg(agg_dict)
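
Because the agreement computation below treats these per-revision label counts as the rater judgements for each unit, a useful check is that they sum to the number of annotations per revision (a sketch, not an executed cell):

In [ ]:
print((ia_df[ATTACK_COLUMNS].sum(axis=1) == dat.groupby('rev_id').size()).all())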

In [17]:
%load_ext autoreload
%autoreload 2
from krippendorf_alpha import *

In [18]:
print('Attack: ')
print(Krippendorf_alpha(ia_df, ATTACK_COLUMNS))
print('Aggression: ')
print(Krippendorf_alpha(ia_df, AGGRESSIVE_COLUMNS))


Attack: 
0.451278401328
Aggression: 
0.438842898582
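
The krippendorf_alpha module is not included in this notebook, so for reference here is a minimal sketch of nominal Krippendorff's alpha computed directly from the per-unit label counts in ia_df. It is an assumed reimplementation, not the project's Krippendorf_alpha, and may differ from it in detail:

In [ ]:
def krippendorff_alpha_nominal(df, count_columns):
    # n_uc: number of annotators assigning each label c to unit (revision) u
    counts = df[count_columns].values.astype(float)
    n_u = counts.sum(axis=1)
    counts, n_u = counts[n_u > 1], n_u[n_u > 1]  # alpha is defined for units with >= 2 ratings
    n = n_u.sum()
    # observed disagreement: within-unit annotator pairs that chose different labels
    D_o = ((n_u ** 2 - (counts ** 2).sum(axis=1)) / (n_u - 1)).sum() / n
    # expected disagreement from the pooled label frequencies
    n_c = counts.sum(axis=0)
    D_e = (n ** 2 - (n_c ** 2).sum()) / (n * (n - 1))
    return 1 - D_o / D_e

print(krippendorff_alpha_nominal(ia_df, ATTACK_COLUMNS))
print(krippendorff_alpha_nominal(ia_df, AGGRESSIVE_COLUMNS))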

In [ ]: